# load packages
library(tidyverse)
library(data.table)
library(leaps)
library(glmnet)
library(caret)

Intro

Note: if code is repeated, it is only commented the first time it appears.

# pull the Boston housing data into a data.table
dt <- as.data.table(ISLR2::Boston)
# glance at the first few rows
head(dt)

The ISLR2::Boston dataset is described in its documentation as “a data set containing housing values in 506 suburbs of Boston.” If you want to learn more, I suggest visiting https://rdocumentation.org/packages/ISLR2/versions/1.3-1/topics/Boston.

Variable Description Type
crim per capita crime rate by town. double
zn proportion of residential land zoned for lots over 25,000 sq.ft. double
indus proportion of non-retail business acres per town. double
chas Charles River dummy variable (=1 if tract bounds river; 0 otherwise). integer (boolean)
nox nitrogen oxides concentration (parts per 10 million). double
rm average number of rooms per dwelling. double
age proportion of owner-occupied units built prior to 1940. double
dis weighted mean of distances to five Boston employment centres. double
rad index of accessibility to radial highways. integer
tax full-value property-tax rate per $10,000. double
ptratio pupil-teacher ratio by town. double
lstat lower status of the population (percent). double
medv median value of owner-occupied homes in $1000s. double

Summary Stats

# get summary stats (min / quartiles / mean / max) for every column
summary(dt)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          lstat      
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
##  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00
# check for any correlation coefficients over .75
# NOTE(review): dewey::ifelsedata is a personal helper package; it appears to
# blank out correlations failing the ".75" / "x >= y & x != 1" condition,
# leaving only the large off-diagonal values -- confirm against dewey's docs
df <- dewey::ifelsedata(data.frame(round(cor(dt), 3)), 
                        .75, "x >= y & x != 1", matchCols = FALSE)
# set the row names to match the columns so the matrix reads naturally
rownames(df) <- colnames(df)
# preview the filtered correlation matrix
df
# produce pairs plots with correlation coefficients for the first six columns
GGally::ggpairs(dt[, c(1:6)], progress = FALSE)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# same pairs plot for crim against the remaining columns
GGally::ggpairs(dt[, c(1, 7:12)], progress = FALSE)

The first output is the basic summary statistics; the second is a correlation matrix keeping only values above \(.75\). There’s nothing crazy with these numbers. It is weird that only tax and rad are correlated above \(.75\), but then again highways decrease property taxes or something. idk. I’m not an urban anything.

Splitting the data

# fix the RNG state so the split is reproducible
set.seed(123)

# draw TRUE/FALSE per row with 80/20 weighting to mark the training rows
inTrain <- sample(c(TRUE, FALSE), nrow(dt), replace = TRUE, prob = c(.8, .2))

# TRUE rows become the training set, FALSE rows the hold-out test set
trainDt <- dt[inTrain]
testDt <- dt[!inTrain]

Subset selection

Normal

# exhaustive best-subset selection with up to 12 predictors
subset_fit <- regsubsets(crim ~ ., trainDt, nvmax = 12)
subset_stats <- summary(subset_fit)

# tabulate BIC, Cp, and adjusted R^2, best (highest adjusted R^2) first
data.table("BIC" = subset_stats$bic,
           "Cp" = subset_stats$cp,
           "r2" = subset_stats$adjr2)[order(-r2, BIC, Cp)]

# Cp as a scatter by model size next to the regsubsets Cp map
par(mfrow = c(1, 2))
plot(subset_stats$cp, xlab = "number of features", ylab = "cp")
plot(subset_fit, scale = "Cp")

# the same pair of charts for BIC
par(mfrow = c(1, 2))
plot(subset_stats$bic, xlab = "number of features", ylab = "bic")
plot(subset_fit, scale = "bic")

# save the best formula
normal <- as.formula("crim ~ + zn + nox + dis + rad + ptratio + lstat + medv")

Forward

# forward stepwise selection, up to 12 predictors
fwd_fit <- regsubsets(crim ~ ., trainDt, nvmax = 12, method = "forward")
fwd_stats <- summary(fwd_fit)

# tabulate BIC, Cp, and adjusted R^2, best (highest adjusted R^2) first
data.table("BIC" = fwd_stats$bic,
           "Cp" = fwd_stats$cp,
           "r2" = fwd_stats$adjr2)[order(-r2, BIC, Cp)]

# Cp by model size next to the regsubsets Cp map
par(mfrow = c(1, 2))
plot(fwd_stats$cp, xlab = "number of features", ylab = "cp")
plot(fwd_fit, scale = "Cp")

# the same pair of charts for BIC
par(mfrow = c(1, 2))
plot(fwd_stats$bic, xlab = "number of features", ylab = "bic")
plot(fwd_fit, scale = "bic")

# save the best formula found by forward selection
forward <- as.formula("crim ~ + rad + lstat")

Backward

# backward stepwise selection, up to 12 predictors
bwd_fit <- regsubsets(crim ~ ., trainDt, nvmax = 12, method = "backward")
bwd_stats <- summary(bwd_fit)

# tabulate BIC, Cp, and adjusted R^2, best (highest adjusted R^2) first
data.table("BIC" = bwd_stats$bic,
           "Cp" = bwd_stats$cp,
           "r2" = bwd_stats$adjr2)[order(-r2, BIC, Cp)]

# Cp by model size next to the regsubsets Cp map
par(mfrow = c(1, 2))
plot(bwd_stats$cp, xlab = "number of features", ylab = "cp")
plot(bwd_fit, scale = "Cp")

# the same pair of charts for BIC
par(mfrow = c(1, 2))
plot(bwd_stats$bic, xlab = "number of features", ylab = "bic")
plot(bwd_fit, scale = "bic")

# save the best formula found by backward selection
backward <- as.formula("crim ~ + zn + dis + rad + medv")

Final selection

# run regsearch to find the best model
# NOTE(review): dewey::regsearch is a personal package; it appears to build and
# fit every gaussian formula with 1-12 terms drawn from the non-crim columns
# plus the lstat*rad interaction, then rank them -- confirm the positional
# arguments (the trailing 0 / FALSE / TRUE) against dewey's documentation
regs <- dewey::regsearch(trainDt, "crim", c(colnames(trainDt[, !c("crim")]), "lstat*rad"), 1, 12, "gaussian", 0, FALSE, TRUE)
## [1] "Assembling regresions..."
## [1] "Creating 8190 formulas. Please be patient, this may take a while."
## [1] "Creating regressions..."
## [1] "Running 5119 regressions. Please be patient, this may take a while."
## [1] "Running regressions..."
# preview the regsearch results
regs
# select the best model regsearch found and save it
dewey <- as.formula("crim ~ + lstat + rad")
# collect all four candidate formulas (c() on formulas yields a list)
forms <- c(normal, forward, backward, dewey)

# print each formula and its test-set OLS summary
# (a plain loop is used because lapply() would not auto-print the summaries;
# the dead commented-out lapply attempt has been removed)
for (x in forms) {
  print(x)
  print(summary(lm(formula = x, testDt)))
}
## crim ~ +zn + nox + dis + rad + ptratio + lstat + medv
## 
## Call:
## lm(formula = x, data = testDt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2349 -1.2980 -0.0535  0.8753 13.9012 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.63421    5.93751   0.612    0.542    
## zn           0.02007    0.02072   0.969    0.335    
## nox         -5.32924    5.01379  -1.063    0.291    
## dis         -0.29191    0.24244  -1.204    0.232    
## rad          0.43554    0.04809   9.057 3.77e-14 ***
## ptratio     -0.09728    0.18226  -0.534    0.595    
## lstat        0.12934    0.08337   1.551    0.124    
## medv        -0.05048    0.06593  -0.766    0.446    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.841 on 86 degrees of freedom
## Multiple R-squared:  0.7015, Adjusted R-squared:  0.6772 
## F-statistic: 28.87 on 7 and 86 DF,  p-value: < 2.2e-16
## 
## crim ~ +rad + lstat
## 
## Call:
## lm(formula = x, data = testDt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3812 -0.9701  0.1127  0.7752 14.2313 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.29654    0.64550  -5.107 1.79e-06 ***
## rad          0.41729    0.03731  11.186  < 2e-16 ***
## lstat        0.15302    0.04708   3.250  0.00162 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.811 on 91 degrees of freedom
## Multiple R-squared:  0.6909, Adjusted R-squared:  0.6841 
## F-statistic: 101.7 on 2 and 91 DF,  p-value: < 2.2e-16
## 
## crim ~ +zn + dis + rad + medv
## 
## Call:
## lm(formula = x, data = testDt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1607 -0.9204 -0.0167  0.4745 14.2917 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.02802    1.28714   1.576   0.1187    
## zn           0.02386    0.01930   1.236   0.2196    
## dis         -0.31035    0.18399  -1.687   0.0952 .  
## rad          0.40829    0.04075  10.020 2.92e-16 ***
## medv        -0.10350    0.03973  -2.605   0.0108 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.846 on 89 degrees of freedom
## Multiple R-squared:  0.6899, Adjusted R-squared:  0.676 
## F-statistic:  49.5 on 4 and 89 DF,  p-value: < 2.2e-16
## 
## crim ~ +lstat + rad
## 
## Call:
## lm(formula = x, data = testDt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3812 -0.9701  0.1127  0.7752 14.2313 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.29654    0.64550  -5.107 1.79e-06 ***
## lstat        0.15302    0.04708   3.250  0.00162 ** 
## rad          0.41729    0.03731  11.186  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.811 on 91 degrees of freedom
## Multiple R-squared:  0.6909, Adjusted R-squared:  0.6841 
## F-statistic: 101.7 on 2 and 91 DF,  p-value: < 2.2e-16

10-fold Validation

# set the trainControl method to 10-fold cross-validation
train_control <- trainControl(method = "cv", number = 10)

# fit formula `f` on the training data under the 10-fold control and score
# the fitted model on the hold-out test set.
# NOTE: the recorded output shows identical test-set stats under 10-fold and
# LOOCV, consistent with caret refitting the final lm on the full training
# set -- so these stats depend on the formula, not the resampling scheme.
score_lm_10fold <- function(f) {
  model <- train(f, data = trainDt, method = "lm",
                 trControl = train_control)
  preds <- predict(model, newdata = testDt)
  c("Type" = "10 Fold",
    "R_Square" = R2(preds, testDt$crim),
    "RMSE" = RMSE(preds, testDt$crim),
    "MAE" = MAE(preds, testDt$crim))
}

# bind all the model results into a matrix, one row per candidate formula
# (replaces four copy-pasted train/predict/score stanzas; the original also
# needlessly re-created an identical trainControl before the forward model)
regStats <- rbind(Normal = score_lm_10fold(normal),
                  Forward = score_lm_10fold(forward),
                  Backward = score_lm_10fold(backward),
                  Dewey = score_lm_10fold(dewey))

LOOCV

# switch the resampling scheme to leave-one-out cross-validation
train_control <- trainControl(method = "LOOCV")

# fit formula `f` on the training data under LOOCV and score it on the
# hold-out test set.
# BUG FIX: the original re-created a 10-fold "cv" trainControl partway
# through this section, so the Forward/Backward/Dewey rows labelled "LOOCV"
# were actually resampled with 10-fold CV; all four formulas now genuinely
# use LOOCV. The recorded test-set stats are unaffected because caret refits
# the final lm on the full training set regardless of resampling.
score_lm_loocv <- function(f) {
  model <- train(f, data = trainDt, method = "lm",
                 trControl = train_control)
  preds <- predict(model, newdata = testDt)
  c("Type" = "LOOCV",
    "R_Square" = R2(preds, testDt$crim),
    "RMSE" = RMSE(preds, testDt$crim),
    "MAE" = MAE(preds, testDt$crim))
}

# append the LOOCV rows to the running results
regStats <- rbind(regStats,
                  Normal = score_lm_loocv(normal),
                  Forward = score_lm_loocv(forward),
                  Backward = score_lm_loocv(backward),
                  Dewey = score_lm_loocv(dewey))
# preview all the regressions so far
regStats
##          Type      R_Square            RMSE               MAE               
## Normal   "10 Fold" "0.672415531514889" "3.76250483312505" "2.84888093130358"
## Forward  "10 Fold" "0.688825316227434" "3.41610927790962" "2.34648962836827"
## Backward "10 Fold" "0.676189775369158" "3.62179308014776" "2.61017890603348"
## Dewey    "10 Fold" "0.688825316227434" "3.41610927790962" "2.34648962836827"
## Normal   "LOOCV"   "0.672415531514889" "3.76250483312505" "2.84888093130358"
## Forward  "LOOCV"   "0.688825316227434" "3.41610927790962" "2.34648962836827"
## Backward "LOOCV"   "0.676189775369158" "3.62179308014776" "2.61017890603348"
## Dewey    "LOOCV"   "0.688825316227434" "3.41610927790962" "2.34648962836827"

Again, regsubsets produces slightly better models, but mine is almost as good and is more parsimonious. As lstat increases by one, crim increases by \(.237\) and when rad increases by one, crim increases by \(.522\).

Ridge and Lasso Regression

10-fold Validation

# define 10-fold cross-validation for caret::train
train_control <- trainControl(method = "cv", number = 10)

# candidate lambda values: 1000 points log-spaced from 1e-2 to 1e5
lambda <- 10^seq(-2, 5, length = 1000)

Ridge Regression

# ridge regression: glmnet with alpha = 0, tuned over the lambda grid using
# the 10-fold trainControl defined above
model_ridge <- train(crim ~ ., data = trainDt, method = "glmnet", 
                     trControl = train_control, 
                     tuneGrid = expand.grid(alpha = 0, lambda = lambda))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# structure of the fitted glmnet object (not a usual regression summary)
summary(model_ridge)
##             Length Class      Mode     
## a0           100   -none-     numeric  
## beta        1200   dgCMatrix  S4       
## df           100   -none-     numeric  
## dim            2   -none-     numeric  
## lambda       100   -none-     numeric  
## dev.ratio    100   -none-     numeric  
## nulldev        1   -none-     numeric  
## npasses        1   -none-     numeric  
## jerr           1   -none-     numeric  
## offset         1   -none-     logical  
## call           5   -none-     call     
## nobs           1   -none-     numeric  
## lambdaOpt      1   -none-     numeric  
## xNames        12   -none-     character
## problemType    1   -none-     character
## tuneValue      2   data.frame list     
## obsLevels      1   -none-     logical  
## param          0   -none-     list
# get the coefficients at the CV-selected lambda (bestTune)
coef(model_ridge$finalModel, model_ridge$bestTune$lambda)
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept)  7.8457780777
## zn           0.0394541881
## indus       -0.0867310441
## chas        -0.9298187135
## nox         -6.7774611223
## rm           0.5824520486
## age         -0.0004376844
## dis         -0.8984261957
## rad          0.4491764668
## tax          0.0050581097
## ptratio     -0.2219848644
## lstat        0.1658805138
## medv        -0.1873594453
# predict on the hold-out test set
prediction_ridge <- predict(model_ridge, newdata = testDt)

# save the test-set stats for the 10-fold-tuned ridge model
Ridge <- c("Type" = "10 Fold",
           "R_Square" = R2(prediction_ridge, testDt$crim),
           "RMSE" = RMSE(prediction_ridge, testDt$crim),
           "MAE" = MAE(prediction_ridge, testDt$crim))

LASSO Regression

# LASSO: the same glmnet setup but with alpha = 1
model_lasso <- train(crim ~ ., data = trainDt, method = "glmnet", 
                     trControl = train_control, 
                     tuneGrid = expand.grid(alpha = 1, lambda = lambda))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# structure of the fitted glmnet object
summary(model_lasso)
##             Length Class      Mode     
## a0           77    -none-     numeric  
## beta        924    dgCMatrix  S4       
## df           77    -none-     numeric  
## dim           2    -none-     numeric  
## lambda       77    -none-     numeric  
## dev.ratio    77    -none-     numeric  
## nulldev       1    -none-     numeric  
## npasses       1    -none-     numeric  
## jerr          1    -none-     numeric  
## offset        1    -none-     logical  
## call          5    -none-     call     
## nobs          1    -none-     numeric  
## lambdaOpt     1    -none-     numeric  
## xNames       12    -none-     character
## problemType   1    -none-     character
## tuneValue     2    data.frame list     
## obsLevels     1    -none-     logical  
## param         0    -none-     list
# coefficients at the CV-selected lambda; `.` marks terms LASSO dropped
coef(model_lasso$finalModel, model_lasso$bestTune$lambda)
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                      s1
## (Intercept) 10.22150762
## zn           0.04097843
## indus       -0.04933820
## chas        -0.70294764
## nox         -6.44460113
## rm           0.33561020
## age          .         
## dis         -0.85830616
## rad          0.54947059
## tax          .         
## ptratio     -0.22309305
## lstat        0.13018587
## medv        -0.19062597
# predict on the hold-out test set
prediction_lasso <- predict(model_lasso, newdata = testDt)

# save the test-set stats for the 10-fold-tuned LASSO model
LASSO <- c("Type" = "10 Fold",
           "R_Square" = R2(prediction_lasso, testDt$crim),
           "RMSE" = RMSE(prediction_lasso, testDt$crim),
           "MAE" = MAE(prediction_lasso, testDt$crim))

# append the 10-fold ridge/LASSO rows to the running results
regStats <- rbind(regStats, Ridge, LASSO)

A quick linear model

# plain OLS on all predictors as a baseline
# NOTE(review): no trControl is given, so caret presumably falls back to its
# default bootstrap resampling -- confirm; the final lm is still fit on trainDt
model_linear <- train(crim ~ ., data = trainDt, 
                     method = "lm", 
                     metric = "Rsquared")
coef(model_linear$finalModel)
##   (Intercept)            zn         indus          chas           nox 
##  17.922807744   0.053237621  -0.072227157  -0.844467579 -12.940740846 
##            rm           age           dis           rad           tax 
##   0.791934168  -0.002455158  -1.290113308   0.619246825  -0.002140583 
##       ptratio         lstat          medv 
##  -0.400684841   0.142182254  -0.266084003
# predict on the hold-out test set
prediction_linear <- predict(model_linear, newdata = testDt)

Model Comparisons

# create data.frames to compare the results from the ridge, LASSO, and linear models
# NOTE(review): this table is printed twice -- once with glmnet's default
# s1 / s1.1 column names and once renamed below; the first print is redundant
data.frame(
  ridge = as.data.frame.matrix(coef(model_ridge$finalModel, model_ridge$finalModel$lambdaOpt)),
  lasso = as.data.frame.matrix(coef(model_lasso$finalModel, model_lasso$finalModel$lambdaOpt)), 
  linear = (model_linear$finalModel$coefficients)
)
# the same table again, with readable column names
data.frame(
  ridge = as.data.frame.matrix(coef(model_ridge$finalModel, model_ridge$finalModel$lambdaOpt)),
  lasso = as.data.frame.matrix(coef(model_lasso$finalModel, model_lasso$finalModel$lambdaOpt)), 
  linear = (model_linear$finalModel$coefficients)
) %>%  
rename(ridge = s1, lasso = s1.1)
# test-set R^2 for the three models side by side
c("Ridge_Rsq" = R2(prediction_ridge, testDt$crim),
  "Lasso_Rsq" = R2(prediction_lasso, testDt$crim),
  "Linear_Rsq" = R2(prediction_linear, testDt$crim))
##  Ridge_Rsq  Lasso_Rsq Linear_Rsq 
##  0.6826257  0.6951040  0.6856090
# chart the coefficients as lambda increases
library(coefplot)

coefpath(model_ridge$finalModel)
# NOTE(review): prefer label = TRUE over the reassignable shorthand T
plot(model_ridge$finalModel, xvar = "lambda", label = T)

LOOCV

# switch the resampling scheme to leave-one-out cross-validation
train_control <- trainControl(method = "LOOCV")

# same log-spaced lambda grid: 1000 values from 1e-2 to 1e5
lambda <- 10^seq(-2, 5, length = 1000)

Ridge Regression

# ridge again (alpha = 0), now tuned under LOOCV
model_ridge <- train(crim ~ ., data = trainDt, method = "glmnet", 
                     trControl = train_control, 
                     tuneGrid = expand.grid(alpha = 0, lambda = lambda))
# structure of the fitted glmnet object
summary(model_ridge)
##             Length Class      Mode     
## a0           100   -none-     numeric  
## beta        1200   dgCMatrix  S4       
## df           100   -none-     numeric  
## dim            2   -none-     numeric  
## lambda       100   -none-     numeric  
## dev.ratio    100   -none-     numeric  
## nulldev        1   -none-     numeric  
## npasses        1   -none-     numeric  
## jerr           1   -none-     numeric  
## offset         1   -none-     logical  
## call           5   -none-     call     
## nobs           1   -none-     numeric  
## lambdaOpt      1   -none-     numeric  
## xNames        12   -none-     character
## problemType    1   -none-     character
## tuneValue      2   data.frame list     
## obsLevels      1   -none-     logical  
## param          0   -none-     list
# coefficients at the LOOCV-selected lambda
coef(model_ridge$finalModel, model_ridge$bestTune$lambda)
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept)  7.8457780777
## zn           0.0394541881
## indus       -0.0867310441
## chas        -0.9298187135
## nox         -6.7774611223
## rm           0.5824520486
## age         -0.0004376844
## dis         -0.8984261957
## rad          0.4491764668
## tax          0.0050581097
## ptratio     -0.2219848644
## lstat        0.1658805138
## medv        -0.1873594453
# predict on the hold-out test set
prediction_ridge <- predict(model_ridge, newdata = testDt)

# save the test-set stats for the LOOCV-tuned ridge model
Ridge <- c("Type" = "LOOCV", 
           "R_Square" = R2(prediction_ridge, testDt$crim),
           "RMSE" = RMSE(prediction_ridge, testDt$crim),
           "MAE" = MAE(prediction_ridge, testDt$crim))

LASSO Regression

# LASSO (alpha = 1) tuned under LOOCV
model_lasso <- train(crim ~ ., data = trainDt, method = "glmnet", 
                     trControl = train_control, 
                     tuneGrid = expand.grid(alpha = 1, lambda = lambda))
# structure of the fitted glmnet object
summary(model_lasso)
##             Length Class      Mode     
## a0           77    -none-     numeric  
## beta        924    dgCMatrix  S4       
## df           77    -none-     numeric  
## dim           2    -none-     numeric  
## lambda       77    -none-     numeric  
## dev.ratio    77    -none-     numeric  
## nulldev       1    -none-     numeric  
## npasses       1    -none-     numeric  
## jerr          1    -none-     numeric  
## offset        1    -none-     logical  
## call          5    -none-     call     
## nobs          1    -none-     numeric  
## lambdaOpt     1    -none-     numeric  
## xNames       12    -none-     character
## problemType   1    -none-     character
## tuneValue     2    data.frame list     
## obsLevels     1    -none-     logical  
## param         0    -none-     list
# coefficients at the LOOCV-selected lambda; `.` marks terms LASSO dropped
coef(model_lasso$finalModel, model_lasso$bestTune$lambda)
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept)  14.25157753
## zn            0.04702113
## indus        -0.07062848
## chas         -0.76177849
## nox         -10.21282576
## rm            0.57977335
## age           .         
## dis          -1.08925741
## rad           0.57151769
## tax           .         
## ptratio      -0.32248621
## lstat         0.13731756
## medv         -0.23010282
# predict on the hold-out test set
prediction_lasso <- predict(model_lasso, newdata = testDt)

# save the test-set stats for the LOOCV-tuned LASSO model
LASSO <- c("Type" = "LOOCV",
           "R_Square" = R2(prediction_lasso, testDt$crim),
           "RMSE" = RMSE(prediction_lasso, testDt$crim),
           "MAE" = MAE(prediction_lasso, testDt$crim))

# append the LOOCV ridge/LASSO rows to the running results
regStats <- rbind(regStats, Ridge, LASSO)

A quick linear model

# baseline OLS again -- identical to the earlier "quick linear model" chunk,
# repeated so the LOOCV section is self-contained
model_linear <- train(crim ~ ., data = trainDt, 
                     method = "lm", 
                     metric = "Rsquared")
coef(model_linear$finalModel)
##   (Intercept)            zn         indus          chas           nox 
##  17.922807744   0.053237621  -0.072227157  -0.844467579 -12.940740846 
##            rm           age           dis           rad           tax 
##   0.791934168  -0.002455158  -1.290113308   0.619246825  -0.002140583 
##       ptratio         lstat          medv 
##  -0.400684841   0.142182254  -0.266084003
# predict on the hold-out test set
prediction_linear <- predict(model_linear, newdata = testDt)

Model Comparisons

# compare coefficients from the LOOCV-tuned ridge/LASSO models and OLS
# NOTE(review): printed twice -- first with glmnet's default s1 / s1.1 names
data.frame(
  ridge = as.data.frame.matrix(coef(model_ridge$finalModel, model_ridge$finalModel$lambdaOpt)),
  lasso = as.data.frame.matrix(coef(model_lasso$finalModel, model_lasso$finalModel$lambdaOpt)), 
  linear = (model_linear$finalModel$coefficients)
)
# the same table again, with readable column names
data.frame(
  ridge = as.data.frame.matrix(coef(model_ridge$finalModel, model_ridge$finalModel$lambdaOpt)),
  lasso = as.data.frame.matrix(coef(model_lasso$finalModel, model_lasso$finalModel$lambdaOpt)), 
  linear = (model_linear$finalModel$coefficients)
) %>%  
rename(ridge = s1, lasso = s1.1)
# test-set R^2 for the three models side by side
c("Ridge_Rsq" = R2(prediction_ridge, testDt$crim),
  "Lasso_Rsq" = R2(prediction_lasso, testDt$crim),
  "Linear_Rsq" = R2(prediction_linear, testDt$crim))
##  Ridge_Rsq  Lasso_Rsq Linear_Rsq 
##  0.6826257  0.6899497  0.6856090
# chart the coefficient paths as lambda increases
library(coefplot)

coefpath(model_ridge$finalModel)
# NOTE(review): prefer label = TRUE over the reassignable shorthand T
plot(model_ridge$finalModel, xvar = "lambda", label = T)

Closing Thoughts

# preview the accumulated results (still a character matrix at this point;
# the conversion to a data.table happens just below)
regStats
##          Type      R_Square            RMSE               MAE               
## Normal   "10 Fold" "0.672415531514889" "3.76250483312505" "2.84888093130358"
## Forward  "10 Fold" "0.688825316227434" "3.41610927790962" "2.34648962836827"
## Backward "10 Fold" "0.676189775369158" "3.62179308014776" "2.61017890603348"
## Dewey    "10 Fold" "0.688825316227434" "3.41610927790962" "2.34648962836827"
## Normal   "LOOCV"   "0.672415531514889" "3.76250483312505" "2.84888093130358"
## Forward  "LOOCV"   "0.688825316227434" "3.41610927790962" "2.34648962836827"
## Backward "LOOCV"   "0.676189775369158" "3.62179308014776" "2.61017890603348"
## Dewey    "LOOCV"   "0.688825316227434" "3.41610927790962" "2.34648962836827"
## Ridge    "10 Fold" "0.682625748162246" "3.57006978669577" "2.67489924552463"
## LASSO    "10 Fold" "0.695104028728848" "3.5079189807032"  "2.57309885980009"
## Ridge    "LOOCV"   "0.682625748162246" "3.57006978669577" "2.67489924552463"
## LASSO    "LOOCV"   "0.689949681902693" "3.64562535063843" "2.75537667733455"
# convert to a data.table, pulling the matrix row names into a Model column
regStats <- data.table("Model" = names(regStats[,1]), regStats)
# rank models by R^2, best first
# NOTE(review): R_Square is stored as character, so this descending order is
# lexicographic; it happens to work because every value shares the same
# "0.6..." formatting, but converting to numeric first would be safer
regStats[order(-R_Square)]

LASSO regression with both 10-fold and LOOCV produced the best results. I did notice that they each produced a different \(\lambda\) value which I found mildly interesting. I’m assuming that this is largely just due to differences in how the model was developed. I’m pleasantly surprised to find my subset method ranked tied for third place with the forward subsets. I’m a little surprised that both models had the same \(R^2\), \(RMSE\), and \(MAE\) for both 10-fold and LOOCV. I am surprised to find Ridge towards the bottom of the list, but not at all surprised that backwards and normal subsets did even worse.